This file simulates the data I anticipate working with in the coordinated analysis that will constitute my dissertation.
# Design metadata for the eight studies in the coordinated analysis:
# per-study counts of targets, perceivers, and stimulus videos, plus the
# stimulus paradigm and whether inferences occur on a set or variable schedule.
metadata <- tibble(
  study = c("new_moms", "deception_detection", "karyn_diss", "murat_rep",
            "mideast_men", "stem", "barter", "double_empathy"),
  targets = c(20, 95, 212, 200, 9, 59, 310, 8),
  perceivers = c(60, 95, 212, 200, 326, 121, 310, 100),
  videos = c(20, 95, 318, 300, 9, 121, 155, 8),
  # "ss" = Standard Stimulus, "di" = Dyadic Interaction
  paradigm = factor(
    c("ss", "di", "di", "di", "ss", "di", "di", "ss"),
    levels = c("ss", "di"),
    labels = c("Standard Stimulus", "Dyadic Interaction")
  ),
  inference_schedule = factor(
    c("Variable", "Variable", "Set", "Set",
      "Variable", "Set", "Variable", "Set")
  )
)
# Verbal response mode categories used to label each simulated turn
# (presumably Stiles' VRM taxonomy -- TODO confirm with codebook).
vrm <- c("Disclosure", "Edification", "Advisement", "Confirmation", "Question", "Acknowledgment", "Interpretation", "Reflection")
generate_random_number <- function(mean = 8, sd = 3, min = 3, max = 19, digits = 0) {
  # Draw one value from a truncated normal distribution via rejection
  # sampling: sample N(mean, sd), round to `digits` decimal places, and
  # retry until the result falls within [min, max].
  #
  # Guard against an impossible truncation window, which would otherwise
  # spin forever in the rejection loop.
  stopifnot(min <= max)
  random_number <- NA
  while (is.na(random_number) || random_number < min || random_number > max) {
    random_number <- round(rnorm(1, mean = mean, sd = sd), digits)
  }
  random_number
}
multiply_out <- function(df, n_column, column_name) {
  # Expand `df` so each row is repeated `df[[n_column]]` times, adding a
  # new column `column_name` that counts 1..n within each original row.
  counts <- df[[n_column]]
  expanded <- df[rep(seq_len(nrow(df)), times = counts), , drop = FALSE]
  expanded[[column_name]] <- sequence(counts)
  expanded
}
SimulateStudy <- function(study_name, paradigm, seed = 123,
                          n_perceivers = 1, n_videos_per_perceiver = 1) {
  # Simulate turn-level data for one study listed in `metadata`.
  #
  # study_name: one of metadata$study.
  # paradigm:   "DI" (dyadic interaction: one perceiver per video) or
  #             "SS" (standard stimulus: each of `n_perceivers` perceivers
  #             rates `n_videos_per_perceiver` randomly sampled videos).
  # seed:       RNG seed so each study's simulation is reproducible.
  # n_perceivers, n_videos_per_perceiver: used only for the "SS" paradigm.
  #
  # Returns a tibble with one row per turn, grouped by (name, chapter).
  paradigm <- match.arg(paradigm, c("DI", "SS"))
  set.seed(seed)

  # Filter for current study
  study_data <- metadata %>%
    filter(study == study_name)
  if (nrow(study_data) == 0) {
    stop("Unknown study: ", study_name, call. = FALSE)
  }

  # Simulate number of chapters within each video
  df <- tibble(
    name = paste0(study_name, "_", seq_len(study_data$videos)),
    n_video = seq_len(study_data$videos),
    n_chapter = NA
  )
  for (i in seq_len(study_data$videos)) {
    df$n_chapter[i] <- generate_random_number()
  }
  df <- multiply_out(df, n_column = "n_chapter", column_name = "chapter")

  # Simulate number of turns within each chapter.
  # BUG FIX: after multiply_out() there is one row per *chapter*, not per
  # video, so iterate over every row. The old loop over study_data$videos
  # assigned into a column that did not yet exist (the source of the
  # "Unknown or uninitialised column: `n_turns`" warnings) and left all
  # rows past the first `videos` rows uninitialised.
  df$n_turns <- NA_real_
  for (i in seq_len(nrow(df))) {
    df$n_turns[i] <- generate_random_number(mean = 11, sd = 6,
                                            min = 4, max = 40)
  }
  df <- multiply_out(df, n_column = "n_turns", column_name = "turn")

  # STIMULUS LEVEL VARIABLES (one row per turn; grouped per chapter)
  df <- df %>%
    group_by(name, chapter) %>%
    mutate(
      # One chapter length per group; scalar is recycled across turns.
      chapter_length = generate_random_number(mean = 45, sd = 6,
                                              min = 18, max = 120,
                                              digits = 3),
      # Random turn lengths rescaled so they sum to the chapter length.
      turn_length = {
        raw_turn_lengths <- runif(n(), min = 4, max = 40)
        scaled_turn_lengths <- raw_turn_lengths / sum(raw_turn_lengths) *
          chapter_length
        round(scaled_turn_lengths, 3)
      },
      start_time = cumsum(lag(turn_length, default = 0)),
      end_time = cumsum(turn_length),
      turns_from_inference = n() - row_number() + 1,
      turn_percent_through_chapter = (row_number() / n()) * 100,
      time_percent_through_chapter = end_time / chapter_length * 100,
      # Randomly pick which speaker goes first, then alternate turns.
      speaker = ifelse(rep(sample(c(TRUE, FALSE), 1), n()),
                       rep(c("Partner", "Target"), length.out = n()),
                       rep(c("Target", "Partner"), length.out = n())) %>%
        factor(),
      # Semantic similarity drifts toward 1 across the chapter (more
      # strongly for targets); resample until < 5% of values hit the
      # [-1, 1] bounds, then clamp.
      sem_sim = {
        repeat {
          base_random <- runif(n(), min = -1.00, max = 1.00)
          weight <- ifelse(speaker == "Partner",
                           ((turn_percent_through_chapter - 1) / 180)^2,
                           ((turn_percent_through_chapter - 1) / 120)^2)
          noise <- ifelse(speaker == "Partner",
                          rnorm(n(), mean = 0, sd = 0.3),
                          rnorm(n(), mean = 0, sd = 0.1))
          sem_sim_raw <- base_random * (1 - weight) + 1 * weight + noise
          if (sum(sem_sim_raw <= -0.99 | sem_sim_raw >= 0.99) / n() < 0.05) {
            break
          }
        }
        pmin(pmax(sem_sim_raw, -1.00), 1.00)
      },
      # Language variables correlated with sem_sim to varying degrees.
      # NOTE(review): the slope and the noise SD are not always derived
      # from the same target correlation -- confirm intended values.
      cog_processing_language = sem_sim + rnorm(n(), mean = 0, sd = sqrt(1 - 0.45^2)),
      memory_language = sem_sim * 0.20 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.20^2)),
      emo_anxious_language = sem_sim * 0.10 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.04^2)),
      emo_sad_language = sem_sim * 0.15 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.02^2)),
      emo_anger_language = sem_sim * 0.19 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.09^2)),
      certain_language = sem_sim * 0.17 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.07^2)),
      self_ref_language = sem_sim * 0.21 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.21^2)),
      curious_language = sem_sim * 0.10 + rnorm(n(), mean = 0, sd = sqrt(1 - 0.10^2)),
      vrm = sample(vrm, n(), replace = TRUE)
    )

  # PARTICIPANT-LEVEL VARIABLES
  if (paradigm == "DI") {
    # One perceiver per video: IDs track the video number directly.
    df <- df %>%
      mutate(
        target = paste0(name, "_target_", n_video),
        perceiver = paste0(name, "_perceiver_", n_video),
        partner = paste0(name, "_partner_", n_video),
        paradigm = "Dyadic Interaction"
      )
  } else if (paradigm == "SS") {
    # Have to double-up on the naming because nesting removes the
    # grouping column.
    df <- df %>%
      mutate(
        name2 = name
      )
    df_list <- df %>%
      group_by(name) %>%
      nest()
    # Each perceiver rates a random subset of videos.
    out_list <- vector("list", n_perceivers)
    for (i in seq_len(n_perceivers)) {
      df_i <- sample(df_list$data, n_videos_per_perceiver) %>%
        bind_rows()
      df_i <- df_i %>%
        mutate(
          target = paste0(name2, "_target_", n_video),
          perceiver = paste0(name2, "_perceiver_", i),
          partner = paste0(name2, "_partner_", n_video),
          paradigm = "Standard Stimulus"
        )
      out_list[[i]] <- df_i
    }
    df <- bind_rows(out_list)
    df$name <- df$name2
    df <- df %>%
      select(-name2)
  }
  df
}
# Simulate all eight studies and stack them into one turn-level data frame.
# BUG FIX: the murat_rep entry previously simulated "karyn_diss" (copy-paste
# error); it now simulates its own study.
# NOTE(review): every call uses the default seed = 123 -- confirm whether
# each study should get a distinct seed.
df <- list(
  stem = SimulateStudy("stem", paradigm = "DI"),
  barter = SimulateStudy("barter", paradigm = "DI"),
  deception_detection = SimulateStudy("deception_detection", paradigm = "DI"),
  new_moms = SimulateStudy("new_moms",
                           paradigm = "SS",
                           n_perceivers = 3,
                           n_videos_per_perceiver = 3),
  karyn_diss = SimulateStudy("karyn_diss",
                             paradigm = "SS",
                             n_perceivers = 212,
                             n_videos_per_perceiver = 3),
  murat_rep = SimulateStudy("murat_rep",
                            paradigm = "SS",
                            n_perceivers = 200,
                            n_videos_per_perceiver = 3),
  mideast_men = SimulateStudy("mideast_men",
                              paradigm = "SS",
                              n_perceivers = 326,
                              n_videos_per_perceiver = 4),
  double_empathy = SimulateStudy("double_empathy",
                                 paradigm = "SS",
                                 n_perceivers = 100,
                                 n_videos_per_perceiver = 4)
) %>%
  bind_rows() %>%
  ungroup()
## Warning: Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
## Unknown or uninitialised column: `n_turns`.
# Convert every character column to a factor for modeling and plotting.
df <- mutate(df, across(where(is.character), as.factor))
# Average semantic similarity at each point in the chapter, for the
# overall trend line.
avg_data <- df %>%
  group_by(turn_percent_through_chapter) %>%
  summarize(sem_sim = mean(sem_sim), .groups = "drop")
ggplot(df, aes(x = turn_percent_through_chapter, y = sem_sim)) +
  # One faint gray line per perceiver. `linewidth` replaces the `size`
  # aesthetic for lines, deprecated in ggplot2 3.4.0.
  geom_line(aes(group = perceiver), color = "gray",
            alpha = 0.01, linewidth = 0.5) +
  geom_hline(aes(yintercept = 0), color = "black") +
  geom_smooth(data = avg_data, aes(x = turn_percent_through_chapter,
                                   y = sem_sim),
              method = "loess", se = FALSE, color = "black") +
  labs(
    title = "Turn Distance from Inference by Semantic Similarity",
    x = "Proximity to Inference",
    y = "Semantic Similarity"
  ) +
  papaja::theme_apa(
    base_family = "Times New Roman"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
# Per-speaker average semantic similarity across the chapter.
avg_data <- df %>%
  group_by(turn_percent_through_chapter, speaker) %>%
  summarize(sem_sim = mean(sem_sim), .groups = "drop")
ggplot(df, aes(x = turn_percent_through_chapter, y = sem_sim)) +
  # `linewidth` replaces the `size` aesthetic for lines (ggplot2 >= 3.4.0).
  geom_line(aes(group = perceiver, color = speaker),
            alpha = 0.005, linewidth = 0.5) +
  scale_color_manual(
    values = c("Partner" = "red", "Target" = "blue"),
    name = "Speaker"
  ) +
  geom_hline(aes(yintercept = 0), color = "black") +
  # Separate average lines for Target and Partner, colored to match the
  # manual scale above (previously swapped: Target was drawn red).
  geom_smooth(data = avg_data %>% filter(speaker == "Target"),
              aes(x = turn_percent_through_chapter, y = sem_sim),
              method = "loess", se = FALSE, color = "blue") +
  geom_smooth(data = avg_data %>% filter(speaker == "Partner"),
              aes(x = turn_percent_through_chapter, y = sem_sim),
              method = "loess", se = FALSE, color = "red") +
  labs(
    title = "Turn Distance from Inference by Semantic Similarity",
    x = "Proximity to Inference",
    y = "Semantic Similarity",
    color = "Speaker"
  ) +
  papaja::theme_apa(
    base_family = "Times New Roman"
  ) +
  theme(legend.position = "none")
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction <- function(df, variable, var_name = "VARIABLE") {
  # Scatterplot of one simulated language variable against semantic
  # similarity, with an OLS trend line and the Pearson correlation in
  # the caption.
  #
  # df:       turn-level data containing `sem_sim` and `variable`.
  # variable: column name (string) of the language variable to plot.
  # var_name: human-readable label used in the title, y-axis, and legend.
  ggplot(df, aes(x = sem_sim, y = .data[[variable]])) +
    geom_point(color = "black", alpha = 0.01, size = 0.5) +
    geom_smooth(color = "black", method = "lm", se = TRUE) +
    # papaja:: prefix for consistency with the other figures in this file.
    papaja::theme_apa(base_family = "Times New Roman") +
    labs(
      title = paste0("Correlation Between\n ",
                     var_name, " and Semantic Similarity"),
      x = "Semantic Similarity",
      y = var_name,
      caption = paste0("Correlation = ",
                       # [[ extracts plain vectors; cor() on a scalar pair
                       # avoids the 1x1-matrix result of cor(df["a"], df["b"]).
                       round(cor(df[["sem_sim"]], df[[variable]]), 2))
    )
}
# Scatterplots of each simulated language variable against semantic
# similarity, one figure per variable.
LanguageFigsFunction(df, variable = "cog_processing_language", "Cognitive Processing Language")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "memory_language", "Memory Language")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "emo_anxious_language", "Anxious Emotion")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "emo_sad_language", "Sad Emotion")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "emo_anger_language", "Anger Emotion")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "certain_language", "Certainty Language")
## `geom_smooth()` using formula = 'y ~ x'
LanguageFigsFunction(df, variable = "self_ref_language", "Self-Referential Language")
## `geom_smooth()` using formula = 'y ~ x'
# Fixed typo in the displayed label: "Curiousity" -> "Curiosity".
LanguageFigsFunction(df, variable = "curious_language", "Curiosity Language")
## `geom_smooth()` using formula = 'y ~ x'
# Correlation matrix of semantic similarity with the simulated language
# variables, rendered as an APA-style table.
# Fixed typo in the displayed label ("Curiousity" -> "Curiosity") and
# unified the quote style in rename().
df %>%
  select(sem_sim, cog_processing_language, memory_language,
         emo_anxious_language, emo_sad_language, emo_anger_language,
         self_ref_language, curious_language, certain_language) %>%
  rename("Semantic Similarity" = sem_sim,
         "Cognitive Processing" = cog_processing_language,
         "Memory" = memory_language,
         "Anxious" = emo_anxious_language,
         "Sad" = emo_sad_language,
         "Anger" = emo_anger_language,
         "Certainty" = certain_language,
         "Self-Referential" = self_ref_language,
         "Curiosity" = curious_language) %>%
  cor() %>%
  kbl(digits = 2) %>%
  kable_classic()
| Semantic Similarity | Cognitive Processing | Memory | Anxious | Sad | Anger | Self-Referential | Curiosity | Certainty | |
|---|---|---|---|---|---|---|---|---|---|
| Semantic Similarity | 1.00 | 0.48 | 0.11 | 0.03 | 0.06 | 0.09 | 0.10 | 0.03 | 0.08 |
| Cognitive Processing | 0.48 | 1.00 | 0.05 | 0.02 | 0.02 | 0.05 | 0.06 | -0.01 | 0.04 |
| Memory | 0.11 | 0.05 | 1.00 | 0.00 | 0.00 | 0.00 | 0.01 | 0.03 | 0.00 |
| Anxious | 0.03 | 0.02 | 0.00 | 1.00 | -0.01 | 0.00 | 0.01 | 0.01 | -0.02 |
| Sad | 0.06 | 0.02 | 0.00 | -0.01 | 1.00 | -0.01 | 0.02 | 0.01 | 0.02 |
| Anger | 0.09 | 0.05 | 0.00 | 0.00 | -0.01 | 1.00 | 0.02 | 0.01 | 0.00 |
| Self-Referential | 0.10 | 0.06 | 0.01 | 0.01 | 0.02 | 0.02 | 1.00 | -0.01 | 0.02 |
| Curiosity | 0.03 | -0.01 | 0.03 | 0.01 | 0.01 | 0.01 | -0.01 | 1.00 | -0.02 |
| Certainty | 0.08 | 0.04 | 0.00 | -0.02 | 0.02 | 0.00 | 0.02 | -0.02 | 1.00 |